2.0 CCLE Gene Expression Signature

In [1]:
from clustergrammer2 import net
clustergrammer2 backend version 0.2.9
In [2]:
import clustergrammer_groupby as cby
import warnings
warnings.filterwarnings('ignore')
In [3]:
import numpy as np
import pandas as pd
df = {}
df['ccle'] = pd.read_csv('../data/CCLE/CCLE.txt.gz', compression='gzip', index_col=0)
In [4]:
# Load the full CCLE expression matrix into the shared clustergrammer2 `net` object.
net.load_df(df['ccle'])
# Keep N_top=1000 rows ranked by rank_type='var' (presumably the highest-variance genes — TODO confirm).
net.filter_N_top(inst_rc='row', N_top=1000, rank_type='var')
# Z-score normalize along rows (the row labels are gene names).
net.normalize(axis='row', norm_type='zscore')
# Snapshot the filtered, normalized matrix, rounded to 2 decimals, for later cells.
df['var'] = net.export_df().round(2)
# Row labels (gene names) of the retained rows.
genes_top_var = df['var'].index.tolist()
In [5]:
# Reload the top-variance matrix and normalize its rows again.
# NOTE(review): df['var'] was exported after a row z-score in the previous cell, so this
# second normalize looks redundant — confirm it is intentional.
net.load_df(df['var'])
net.normalize(axis='row', norm_type='zscore')
# NOTE(review): df_tmp is not read anywhere in this cell.
df_tmp = net.export_df().round(2)
# Annotate rows with terms from the 'GO_Biological_Process_2018' library.
# Presumably this queries the Enrichr web service and needs network access — TODO confirm.
net.enrichrgram('GO_Biological_Process_2018')
# Keep the annotated matrix; its row labels are tuples of the gene name followed by
# '<term>: True/False<p> Pval ...</p>' category strings.
df['enrichrgram'] = net.export_df()
net.widget()
In [6]:
df['enrichrgram'].index.tolist()[0]
Out[6]:
('KRT19',
 'extracellular matrix organization (GO:0030198): False<p> Pval 9.171026638093131e-29</p>',
 'collagen fibril organization (GO:0030199): False<p> Pval 1.9897240254393085e-12</p>',
 'regulation of cell migration (GO:0030334): False<p> Pval 6.6999547584764606e-15</p>',
 'positive regulation of intracellular signal transduction (GO:1902533): False<p> Pval 5.5852790328680665e-12</p>',
 'negative regulation of cellular process (GO:0048523): False<p> Pval 1.205623445289199e-13</p>',
 'negative regulation of cell proliferation (GO:0008285): False<p> Pval 2.3244136779160615e-10</p>',
 'neutrophil mediated immunity (GO:0002446): False<p> Pval 7.30336780471453e-10</p>',
 'cytokine-mediated signaling pathway (GO:0019221): False<p> Pval 1.2455747717025743e-13</p>',
 'neutrophil degranulation (GO:0043312): False<p> Pval 2.8513592127176324e-09</p>',
 'regulation of angiogenesis (GO:0045765): False<p> Pval 2.3387050175471637e-10</p>')
In [7]:
df['var'].head()
Out[7]:
('cell line: LN18', 'tissue: central_nervous_system', 'histology: glioma', 'sub-histology: astrocytoma_Grade_IV', 'gender: M') ('cell line: 769P', 'tissue: kidney', 'histology: carcinoma', 'sub-histology: clear_cell_renal_cell_carcinoma', 'gender: F') ('cell line: 786O', 'tissue: kidney', 'histology: carcinoma', 'sub-histology: clear_cell_renal_cell_carcinoma', 'gender: M') ('cell line: CAOV3', 'tissue: ovary', 'histology: carcinoma', 'sub-histology: adenocarcinoma', 'gender: F') ('cell line: HEPG2', 'tissue: liver', 'histology: carcinoma', 'sub-histology: hepatocellular_carcinoma', 'gender: M') ('cell line: MOLT4', 'tissue: haematopoietic_and_lymphoid_tissue', 'histology: lymphoid_neoplasm', 'sub-histology: acute_lymphoblastic_T_cell_leukaemia', 'gender: M') ('cell line: NCIH524', 'tissue: lung', 'histology: carcinoma', 'sub-histology: small_cell_carcinoma', 'gender: M') ('cell line: NCIH209', 'tissue: lung', 'histology: carcinoma', 'sub-histology: small_cell_carcinoma', 'gender: M') ('cell line: MIAPACA2', 'tissue: pancreas', 'histology: carcinoma', 'sub-histology: ductal_carcinoma', 'gender: M') ('cell line: MCAS', 'tissue: ovary', 'histology: carcinoma', 'sub-histology: adenocarcinoma', 'gender: F') ... 
('cell line: SLR21', 'tissue: kidney', 'histology: carcinoma', 'sub-histology: renal_cell_carcinoma', 'gender: NA') ('cell line: LNZ308', 'tissue: central_nervous_system', 'histology: glioma', 'sub-histology: astrocytoma_Grade_IV', 'gender: NA') ('cell line: LN340', 'tissue: central_nervous_system', 'histology: glioma', 'sub-histology: astrocytoma_Grade_IV', 'gender: NA') ('cell line: HCC827GR5', 'tissue: lung', 'histology: carcinoma', 'sub-histology: adenocarcinoma', 'gender: NA') ('cell line: SLR20', 'tissue: kidney', 'histology: carcinoma', 'sub-histology: renal_cell_carcinoma', 'gender: NA') ('cell line: HK2', 'tissue: kidney', 'histology: other', 'sub-histology: immortalized_epithelial', 'gender: NA') ('cell line: EW8', 'tissue: bone', 'histology: Ewings_sarcoma-peripheral_primitive_neuroectodermal_tumour', 'sub-histology: NS', 'gender: NA') ('cell line: UOK101', 'tissue: kidney', 'histology: carcinoma', 'sub-histology: clear_cell_renal_cell_carcinoma', 'gender: NA') ('cell line: JHESOAD1', 'tissue: oesophagus', 'histology: carcinoma', 'sub-histology: barrett_associated_adenocarcinoma', 'gender: NA') ('cell line: CH157MN', 'tissue: central_nervous_system', 'histology: meningioma', 'sub-histology: NS', 'gender: NA')
KRT19 -0.66 -0.99 0.69 0.96 0.75 -1.06 -1.04 0.65 0.89 1.29 ... 1.13 -1.02 -0.73 1.33 -1.07 0.91 -0.98 -1.06 1.33 -0.68
EPCAM -1.08 -1.13 -0.43 1.07 0.83 -1.03 0.51 1.06 -1.10 1.08 ... 0.85 -1.12 -1.16 1.24 -0.46 0.75 -1.08 -0.17 1.28 -1.06
TACSTD2 -0.83 -0.83 -0.82 1.68 -0.59 -0.75 -0.76 -0.72 -0.71 1.38 ... 0.96 -0.79 -0.89 1.56 0.91 -0.77 -0.75 -0.79 1.68 -0.81
MAL2 -1.10 0.92 0.14 1.09 0.81 -1.03 0.37 0.69 -0.98 1.28 ... 0.93 -1.13 -1.11 1.17 -1.07 0.65 -0.76 -0.77 1.39 -0.88
TGFBI 0.84 0.78 1.27 0.48 0.94 -1.28 -1.00 -1.22 -0.46 0.42 ... 1.15 1.06 1.22 0.67 -0.88 0.82 -1.30 1.14 0.50 1.09

5 rows × 1037 columns

In [8]:
# enrichrgram_row = df_enr.index.tolist()[0][1:]
# enrichrgram_row = [x.split(': ')[0] + ' '+  x.split('Pval')[1] for x in enrichrgram_row]
# enrichrgram_row
In [10]:
from copy import deepcopy
In [11]:
from ast import literal_eval as make_tuple
# The column labels of df['var'] are parsed with literal_eval, i.e. they are strings holding
# tuple literals such as "('cell line: LN18', 'tissue: ...', ...)".
cols = df['var'].columns.tolist()
# Turn each label string into a real tuple of category strings.
new_cols = [make_tuple(x) for x in cols]
# Work on a deep copy so df['var'] keeps its original labels.
df['var-cat'] = deepcopy(df['var'])
df['var-cat'].columns = new_cols
In [12]:
all_genes = df['ccle'].index.tolist()
len(all_genes)
Out[12]:
18874

CCLE Gene Expression Data, Z-score Genes

In [13]:
# Wrap every gene name in a 1-tuple so row labels are tuples, like the column labels.
new_rows = [(x,) for x in df['var-cat'].index.tolist()]
# Deep copy so df['var-cat'] keeps its plain (non-tuple) row labels.
df['var-tuple'] = deepcopy(df['var-cat'])
df['var-tuple'].index = new_rows
net.load_df(df['var-tuple'])
net.normalize(axis='row', norm_type='zscore')
df_tmp = net.export_df().round(2)
# Reload the rounded matrix so clustering runs on the rounded values.
net.load_df(df_tmp)
net.cluster()
# Add dendrogram-group labels as a row category; the following cells read it as the
# second element of each row tuple (e.g. 'Group 5: cat-5').
net.dendro_cats(axis='row', dendro_level=5)
df['dendro'] = net.export_df()
net.widget()

Select gene list of interest

In [14]:
# Row labels of df['dendro'] are tuples: x[0] is the gene name, x[1] the dendrogram group label.
rows = df['dendro'].index.tolist()
# Sorted gene names in group 'Group 5: cat-5', used below as the immune gene set.
# NOTE(review): the group label is hard-coded and comes from the clustering above; if the
# clustering output changes, this label may select a different cluster — confirm.
immune_genes = sorted([x[0] for x in rows if x[1] == 'Group 5: cat-5'])
len(immune_genes)
Out[14]:
121
In [15]:
# Row labels of df['dendro'] are tuples: x[0] is the gene name, x[1] the dendrogram group label.
rows = df['dendro'].index.tolist()
# Sorted gene names in group 'Group 5: cat-9', used below as the CNS gene set.
# NOTE(review): the group label is hard-coded and comes from the clustering above; if the
# clustering output changes, this label may select a different cluster — confirm.
cns_genes = sorted([x[0] for x in rows if x[1] == 'Group 5: cat-9'])
len(cns_genes)
Out[15]:
113
In [16]:
from glob import glob
all_files = glob('../data/Enrichr_Libraries_of_Interest/*.txt')
all_files
Out[16]:
['../data/Enrichr_Libraries_of_Interest/Disease_Perturbations_from_GEO_up.txt',
 '../data/Enrichr_Libraries_of_Interest/GO_Molecular_Function_2018.txt',
 '../data/Enrichr_Libraries_of_Interest/ARCHS4_TFs_Coexp.txt',
 '../data/Enrichr_Libraries_of_Interest/HomoloGene.txt',
 '../data/Enrichr_Libraries_of_Interest/Reactome_2016.txt',
 '../data/Enrichr_Libraries_of_Interest/WikiPathways_2016.txt',
 '../data/Enrichr_Libraries_of_Interest/ARCHS4_Tissues.txt',
 '../data/Enrichr_Libraries_of_Interest/PPI_Hub_Proteins.txt',
 '../data/Enrichr_Libraries_of_Interest/ARCHS4_Kinases_Coexp.txt',
 '../data/Enrichr_Libraries_of_Interest/KEGG_2016.txt',
 '../data/Enrichr_Libraries_of_Interest/GO_Biological_Process_2018.txt',
 '../data/Enrichr_Libraries_of_Interest/MGI_Mammalian_Phenotype_Level_3.txt',
 '../data/Enrichr_Libraries_of_Interest/MGI_Mammalian_Phenotype_Level_4.txt',
 '../data/Enrichr_Libraries_of_Interest/Single_Gene_Perturbations_from_GEO_up.txt',
 '../data/Enrichr_Libraries_of_Interest/KEA_2015.txt',
 '../data/Enrichr_Libraries_of_Interest/MGI_Mammalian_Phenotype_2017.txt',
 '../data/Enrichr_Libraries_of_Interest/Panther_2016.txt',
 '../data/Enrichr_Libraries_of_Interest/ChEA_2016.txt',
 '../data/Enrichr_Libraries_of_Interest/ARCHS4_IDG_Coexp.txt',
 '../data/Enrichr_Libraries_of_Interest/ARCHS4_Cell-lines.txt']
In [17]:
import os

# Map library name -> parsed GMT contents; the name is the file name without its extension.
gmts = {}
for inst_file in all_files:
    # basename/splitext work with any path separator and strip only the trailing
    # extension (str.replace would also remove a '.txt' in the middle of a name).
    inst_lib = os.path.splitext(os.path.basename(inst_file))[0]
    gmts[inst_lib] = net.load_gmt(inst_file)
    print(inst_lib)
Disease_Perturbations_from_GEO_up
GO_Molecular_Function_2018
ARCHS4_TFs_Coexp
HomoloGene
Reactome_2016
WikiPathways_2016
ARCHS4_Tissues
PPI_Hub_Proteins
ARCHS4_Kinases_Coexp
KEGG_2016
GO_Biological_Process_2018
MGI_Mammalian_Phenotype_Level_3
MGI_Mammalian_Phenotype_Level_4
Single_Gene_Perturbations_from_GEO_up
KEA_2015
MGI_Mammalian_Phenotype_2017
Panther_2016
ChEA_2016
ARCHS4_IDG_Coexp
ARCHS4_Cell-lines
In [18]:
from scipy.stats import binom_test
def enrich_gene_list_using_lib(libs, lib_name, gene_list, background_list, pval_cutoff=0.05):
    """Test a gene list for over-representation in every term of one gene-set library.

    For each term, the expected hit rate is the fraction of ``background_list``
    genes that belong to the term. When the observed hit rate in ``gene_list``
    exceeds that expectation, a two-sided binomial test gives the p-value;
    otherwise the term is assigned a p-value of 0.5 (treated as not enriched).

    Parameters
    ----------
    libs : dict
        Maps library name -> {term name: iterable of gene names}.
    lib_name : str
        Key of the library in ``libs`` to test against.
    gene_list : list
        Genes of interest; becomes the row index of the returned DataFrame.
    background_list : list
        Background gene universe used to compute each term's expected hit rate.
    pval_cutoff : float, optional
        Only terms with a p-value strictly below this value are returned.

    Returns
    -------
    ser_pval : pandas.Series
        P-values sorted ascending, labeled by
        ``(term, 'Library: <lib_name>', 'Pval: <p>')`` tuples.
    df_enr : pandas.DataFrame
        One row per entry of ``gene_list`` and one column per returned term
        (same labels and order as ``ser_pval``); 1 where the gene belongs to
        the term, 0 otherwise. Has no columns when no term passes the cutoff.
    """
    # scipy.stats.binom_test was deprecated in SciPy 1.7 and removed in 1.12;
    # prefer binomtest when available and fall back on older installs.
    try:
        from scipy.stats import binomtest

        def _binom_pval(k, n, p):
            return binomtest(k, n, p).pvalue
    except ImportError:
        from scipy.stats import binom_test as _binom_pval

    lib_json = libs[lib_name]
    len_gene_list = len(gene_list)

    # Term-independent values, computed once instead of once per term.
    background_set = set(background_list)
    len_background = len(background_list)
    gene_set = set(gene_list)

    list_terms = []
    list_pval = []
    # list of series that will be used to make dataframe
    list_term_ser = []

    # An empty gene list cannot be enriched for anything (and would divide by zero below).
    terms_to_test = lib_json if len_gene_list > 0 else []

    for inst_term in terms_to_test:

        term_genes = lib_json[inst_term]

        # expected per-gene hit probability, from the caller-supplied background
        p_expect = len(background_set.intersection(term_genes)) / len_background

        found_genes = list(gene_set.intersection(term_genes))
        actual_k = len(found_genes)

        # only over-represented terms are tested; the rest get a neutral p-value
        if actual_k / len_gene_list > p_expect:
            inst_pval = float(_binom_pval(actual_k, len_gene_list, p_expect))
        else:
            inst_pval = 0.5

        if inst_pval < pval_cutoff:

            term_name = (inst_term, 'Library: ' + lib_name, 'Pval: ' + str(inst_pval))

            # binary membership vector over gene_list: 1 for genes found in the term
            term_ser = pd.Series(data=np.zeros(len_gene_list), index=gene_list, name=term_name)
            term_ser.loc[found_genes] = 1

            list_terms.append(term_name)
            list_pval.append(inst_pval)
            list_term_ser.append(term_ser)

    ser_pval = pd.Series(data=list_pval, index=list_terms, dtype=float).sort_values()

    if not list_term_ser:
        # pd.concat raises on an empty list; return a column-less frame with the gene rows.
        return ser_pval, pd.DataFrame(index=gene_list)

    # rank df_enr by pval
    df_enr = pd.concat(list_term_ser, axis=1)[ser_pval.index.tolist()]

    return ser_pval, df_enr
In [19]:
libraries_of_intersest = ['GO_Biological_Process_2018', 
                          'MGI_Mammalian_Phenotype_2017', 'ChEA_2016',
                          'Disease_Perturbations_from_GEO_up', 'ARCHS4_TFs_Coexp',
                          'ARCHS4_Tissues']

keep_num_terms = 5
In [20]:
# Run the local enrichment for the immune gene set against each library of interest.
inst_genes = immune_genes
df_list = []
for inst_lib in libraries_of_intersest:
    ser_pval, df_enr = enrich_gene_list_using_lib(gmts, inst_lib, inst_genes, all_genes)    
    
    # df_enr columns are ordered by ascending p-value, so this keeps the first
    # keep_num_terms (lowest p-value) terms of each library.
    df_list.append(df_enr.iloc[:, :keep_num_terms])
    
# Gene x term matrix (1 where the gene belongs to the term) across all libraries.
df['enr-immune'] = pd.concat(df_list, axis=1)
df['enr-immune'].shape
Out[20]:
(121, 30)

Immune Gene Enrichment

In [21]:
# Visualize the gene x term membership matrix for the immune gene set.
net.load_df(df['enr-immune'])
# NOTE(review): N_top=500 is larger than the 121 rows reported for df['enr-immune'], so this
# filter presumably keeps every row — confirm whether it is needed.
net.filter_N_top(inst_rc='row', N_top=500, rank_type='sum')
# Cluster with Jaccard distance; the matrix holds 0/1 membership values.
net.cluster(dist_type='jaccard')
net.widget()
In [22]:
# Run the local enrichment for the CNS gene set against each library of interest.
inst_genes = cns_genes
df_list = []
for inst_lib in libraries_of_intersest:
    ser_pval, df_enr = enrich_gene_list_using_lib(gmts, inst_lib, inst_genes, all_genes)    
    
    # df_enr columns are ordered by ascending p-value, so this keeps the first
    # keep_num_terms (lowest p-value) terms of each library.
    df_list.append(df_enr.iloc[:, :keep_num_terms])
    
# Gene x term matrix (1 where the gene belongs to the term) across all libraries.
df['enr-cns'] = pd.concat(df_list, axis=1)
df['enr-cns'].shape
Out[22]:
(113, 30)

CNS Gene Enrichment

In [23]:
# Visualize the gene x term membership matrix for the CNS gene set.
net.load_df(df['enr-cns'])
# NOTE(review): N_top=500 is larger than the 113 rows reported for df['enr-cns'], so this
# filter presumably keeps every row — confirm whether it is needed.
net.filter_N_top(inst_rc='row', N_top=500, rank_type='sum')
# Cluster with Jaccard distance; the matrix holds 0/1 membership values.
net.cluster(dist_type='jaccard')
net.widget()
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
net.load_df(df_term.loc[keep_rows])
net.swap_nan_for_zero()
net.widget()
In [ ]:
 
In [ ]:
from copy import deepcopy
In [ ]:
def enrich_dataframe_using_lib(lib_json, df_ini, background_list, num_top_terms, pval_cutoff=0.05,
                               lib_name='library'):
    """Annotate the rows of a gene-indexed DataFrame with its top enriched terms.

    The row labels of ``df_ini`` are used as the gene list for
    ``enrich_gene_list_using_lib``. The ``num_top_terms`` terms with the
    lowest p-values are then attached to every row label as category strings
    of the form ``'<term>: True|False<p> Pval: <p></p>'``.

    Parameters
    ----------
    lib_json : dict
        A single gene-set library: {term name: iterable of gene names}.
    df_ini : pandas.DataFrame
        Data whose index holds gene names.
    background_list : list
        Background gene universe passed through to the enrichment test.
    num_top_terms : int
        Maximum number of terms to keep.
    pval_cutoff : float, optional
        P-value cutoff passed through to the enrichment test.
    lib_name : str, optional
        Name used for the library in the term labels ('Library: <lib_name>').

    Returns
    -------
    df_cat : pandas.DataFrame
        Copy of ``df_ini`` whose row labels are ``(gene, cat_1, ..., cat_k)`` tuples.
    ser_pval : pandas.Series
        P-values of all terms passing the cutoff, sorted ascending.
    df_term : pandas.DataFrame
        Gene x term membership matrix restricted to the kept terms.
    """
    gene_list = df_ini.index.tolist()

    # enrich_gene_list_using_lib takes a dict of libraries plus the key to use,
    # so wrap the single library under lib_name.
    ser_pval, df_term = enrich_gene_list_using_lib({lib_name: lib_json}, lib_name, gene_list,
                                                   background_list, pval_cutoff=pval_cutoff)

    keep_terms = ser_pval.index.tolist()[:num_top_terms]

    df_term = df_term[keep_terms]

    # Term labels are (term name, 'Library: ...', 'Pval: ...') tuples; pre-compute the
    # name, the p-value string and a set of member genes for fast lookups.
    term_info = [(inst_term[0], inst_term[2], set(lib_json[inst_term[0]]))
                 for inst_term in keep_terms]

    new_rows = []
    for inst_gene in gene_list:
        new_row = (inst_gene,)

        for inst_term_name, inst_term_pval, term_members in term_info:
            inst_found = 'True' if inst_gene in term_members else 'False'
            inst_cat = inst_term_name + ': ' + inst_found + '<p> ' + inst_term_pval + '</p>'
            new_row = new_row + (inst_cat,)

        new_rows.append(new_row)

    # copy so the caller's DataFrame keeps its original row labels
    df_cat = df_ini.copy()
    df_cat.index = new_rows

    return df_cat, ser_pval, df_term
In [ ]:
df_tmp.shape
In [ ]:
len(all_genes)
In [ ]:
df_var.index.tolist()[0]
In [ ]:
df_cat, ser_pval, df_term = enrich_dataframe_using_lib(gmts['go-process'], df_var, all_genes, 10)
In [ ]:
df_cat.shape

Compare Local vs Enrichrgram Enrichment

In [ ]:
df_cat.index.tolist()[0]
In [ ]:
df_enr.index.tolist()[0]

Local Enrichment

In [ ]:
net.load_df(df_cat)
net.widget()
In [ ]:
genes_1k = [x[0] for x in df.index.tolist()]
In [ ]:
genes_1k[:10]
In [ ]:
df_var.index.tolist()[:10]
In [ ]:
ser_pval, df_term = enrich_gene_list_using_lib(gmts['go-process'], df_var.index.tolist(), all_genes)
df_term.shape
In [ ]:
ser_pval, df_term = enrich_gene_list_using_lib(gmts['kea'], df_var.index.tolist(), all_genes)
df_term.shape
In [ ]:
df_term.head()
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
df_term.shape
In [ ]:
genes_1k = [x[0] for x in rows]

KEA

In [ ]:
ser_pval, df_term = enrich_gene_list_using_lib(gmts['kea'], genes_1k, all_genes)
In [ ]:
keep_top_enr = ser_pval.index.tolist()[:10]
In [ ]:
net.load_df(df_term[keep_top_enr])
net.widget()

GO Biological Process

In [ ]:
rows[:10]
In [ ]:
len(all_genes)
In [ ]:
len(genes_top_var)
In [ ]:
ser_pval, df_term = enrich_gene_list_using_lib(gmts['go-process'], genes_top_var, all_genes, 0.01)
df_term.shape

Terms from Enrichrgram

In [ ]:
enrichrgram_row

Locally Enriched Terms

In [ ]:
keep_top_enr = ser_pval.index.tolist()[:10]
keep_top_enr = [x[0] + ' ' + x[1] for x in keep_top_enr]
keep_top_enr
In [ ]:
net.load_df(df_term[keep_top_enr])
net.widget()
In [ ]:
len(gene_list)
In [ ]:
 
In [ ]:
# Generate signatures for the 'tissue' category with clustergrammer_groupby and show them.
# NOTE(review): `df` is a dict of DataFrames in the cells above, but it is passed here as if
# it were a single DataFrame — this cell looks stale; confirm which frame was intended.
df_sig, keep_genes_dict, df_gene_pval, fold_info = cby.generate_signatures(df,
                                                                     'tissue', num_top_dims=100)

net.load_df(df_sig)
net.widget()

Add tissue category to genes

In [ ]:
# For every row of df_sig, take the label of the column holding the row's maximum value.
gene_sig = df_sig.idxmax(axis=1)
gs_dict = {}
for inst_gene in gene_sig.index.tolist():
    # [0] takes the first element of that column label (presumably the tissue name — TODO confirm)
    gs_dict[inst_gene] = gene_sig[inst_gene][0]
    
# NOTE(review): `df` is a dict of DataFrames earlier in the notebook, while `.index` below
# needs a single DataFrame — confirm which frame was intended.
rows = df.index.tolist()
# Relabel each row as (gene, 'Cell Type: <label>'), or (gene, 'N.A.') when the gene has no entry.
new_rows = [(x, 'Cell Type: ' + gs_dict[x]) if x in gs_dict else (x, 'N.A.') for x in rows ]
df.index = new_rows
net.load_df(df)
In [ ]:
ct_color = net.viz['cat_colors']['col']['cat-0']
In [ ]:
def set_cat_colors(axis, cat_index, cat_title=False):
    """Copy every color in the global ``ct_color`` mapping onto a category of ``net``.

    axis and cat_index are forwarded unchanged to ``net.set_cat_color``.
    When cat_title is given, each category name is prefixed with
    ``'<cat_title>: '``; otherwise the ``ct_color`` key is used as is.
    """
    for color_key in ct_color:
        # keep the `!= False` comparison so truthiness rules match the original exactly
        full_name = cat_title + ': ' + color_key if cat_title != False else color_key
        net.set_cat_color(
            axis=axis,
            cat_index=cat_index,
            cat_name=full_name,
            inst_color=ct_color[color_key],
        )
In [ ]:
set_cat_colors('row', 1)

CCLE Data with Gene-Tissue Category

In [ ]:
net.load_df(df)
net.widget()
In [ ]: